by Sebastian Wolf
Network analysis of Steam friends
In this notebook, I study the group of heavy gamers and their connections with other heavy gamers. Specifically, I am interested in the size distribution of heavy-gaming communities, and in whether 'hub' users play more than other heavy gamers.
# Make sure script changes take effect within this session
# (autoreload re-imports edited project modules before each cell executes)
%load_ext autoreload
%autoreload 2
# import some useful packages for this analysis, start spark session
# NOTE(review): wildcard import — presumably provides pd, np, plt, os, F,
# `path`, `spark` and `spark_df_handler` used throughout this notebook;
# confirm against setup.py
from setup import *
# display the SparkSession object (last expression in the cell)
spark
Open localhost:4040 to monitor the spark UI
We first load the data for heavy gamers from the analytics notebook
# Read the heavy-gamer table produced by the analytics notebook
heavy_gamers = pd.read_csv(os.path.join(path, 'heavy_gamers.csv'))
heavy_gamers.head()
# Keep only the id column and the two playtime columns
wanted_cols = ['steamid', 'playtime_forever_player', 'playtime_average_player']
heavy_gamers_select = heavy_gamers.loc[:, wanted_cols]
heavy_gamers_select
# Move the selection into Spark so it can be joined against the friends table
heavy_gamers_spark = spark.createDataFrame(heavy_gamers_select)
# Distinct steam ids as a plain Python list (used for the isin filter below)
heavy_gamers_list = [row.steamid for row in heavy_gamers_spark.select('steamid').distinct().collect()]
Load the list of friends
# Load the raw Friends table via the project's spark handler
spark_handler = spark_df_handler()
spark_handler.load('Friends')
friends = spark_handler.dfraw['Friends']
# Preview the first few rows
friends.limit(5).toPandas()
# Keep only the two id columns and cast them to 64-bit ints in one select
friends = friends.select(
    F.col('steamid_a').cast('long').alias('steamid_a'),
    F.col('steamid_b').cast('long').alias('steamid_b'),
)
friends
# Number of raw friendship rows
friends.count()
# Keep only friendships where BOTH endpoints are heavy gamers
a_is_heavy = F.col('steamid_a').isin(heavy_gamers_list)
b_is_heavy = F.col('steamid_b').isin(heavy_gamers_list)
friends_filter = friends.where(a_is_heavy & b_is_heavy)
# Count how many friendships we have left
friends_filter.count()
Join the list of heavy friends to their playtime data
# Attach each heavy gamer's playtime stats to the friendship edges
join_condition = heavy_gamers_spark.steamid == friends_filter.steamid_a
friends_joined = heavy_gamers_spark.join(friends_filter, join_condition, how='inner')
# Pull the joined result into pandas
friends_joined_pd = friends_joined.toPandas()
# Checkpoint to disk, then reload from the csv
network_csv = os.path.join(path, 'heavy_gamer_network.csv')
friends_joined_pd.to_csv(network_csv, index=False)
friends_joined_pd = pd.read_csv(network_csv)
# steamid_a duplicates steamid after the join, so drop it
friends_joined_pd = friends_joined_pd.drop(columns='steamid_a')
friends_joined_pd
import networkx as nx
# Create networkx graph object from our pandas df
%%time
# Create graph
graph = nx.convert_matrix.from_pandas_edgelist(friends_joined_pd, source = 'steamid', target = 'steamid_b', edge_attr = True)
# Get some basic info about the graph
N,K = graph.order(), graph.size()
avg_deg = float(K)/N
print("Nodes: ", N)
print("Edges: ", K)
print("Average degree: ", round(avg_deg,2))
# Create a histogram of all nodes' degrees
import collections
degree_sequence = sorted(deg for _, deg in graph.degree())  # degree sequence
counts, bin_edges = np.histogram(degree_sequence, 10)
bin_centres = 0.5 * (bin_edges[:-1] + bin_edges[1:])
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(15, 10))
plt.plot(bin_centres, counts, 'bo-')
plt.title("Degree Histogram Heavy Gamers")
plt.ylabel("Number of Nodes")
plt.xlabel("Degree")
plt.grid(True)
%%time
# Create the layout for our graph: force-directed spring layout.
# k = 0.03 pulls nodes much closer together than the default (~1/sqrt(n));
# 50 iterations of the force simulation. `pos` maps node id -> (x, y).
pos = nx.spring_layout(graph, iterations = 50, k = 0.03)
# Draw the graph
%%time
plt.figure(figsize=(25,25))
node_sizes = [v*15 for k, v in graph.degree()]
options = {
'node_size' : node_sizes,
'width': 0.5,
'alpha' : 1,
'with_labels': False,
}
nx.draw(graph,
pos,
**options)
plt.title("Heavy gamer network")
# Make the hover labels for the interactive graph.
# Index the playtime columns by steamid ONCE instead of re-filtering the
# whole dataframe for every node — the old loop was O(nodes * rows).
# drop_duplicates keeps the first row per steamid, matching the old .iloc[0].
player_stats = friends_joined_pd.drop_duplicates('steamid').set_index('steamid')
labels = []
for node in graph.nodes:
    avg_playtime = round(player_stats.at[node, 'playtime_average_player'], 1)
    # playtime_forever_player is presumably in minutes (divided by 60) — TODO confirm
    total_playtime = round(player_stats.at[node, 'playtime_forever_player'] / 60, 1)
    labels.append('Average Playtime: ' + str(avg_playtime) + \
                  '\n Total Playtime: ' + str(total_playtime))
# Make the interactive graph with plotly
import plotly.graph_objects as go
# Node coordinates, in graph-iteration order (matches `labels`)
Xv = [pos[node][0] for node in graph.nodes]
Yv = [pos[node][1] for node in graph.nodes]
# Edge coordinates: each segment is (start, end, None) so plotly draws
# disconnected line pieces in a single trace
Xed = []
Yed = []
for u, v in graph.edges:
    Xed += [pos[u][0], pos[v][0], None]
    Yed += [pos[u][1], pos[v][1], None]
# Edge trace: thin grey lines, no hover
trace1 = go.Scatter(
    x=Xed,
    y=Yed,
    mode='lines',
    line=dict(color='rgb(210,210,210)', width=1),
    hoverinfo='none',
)
# Node trace: marker size scales with degree, hover shows the playtime label
node_sizes = [3 * deg for _, deg in graph.degree()]
trace2 = go.Scatter(
    x=Xv,
    y=Yv,
    mode='markers',
    name='net',
    marker=dict(
        symbol='circle-dot',
        size=node_sizes,
        color='#6959CD',
        line=dict(color='rgb(50,50,50)', width=0.5),
    ),
    text=labels,
    hoverinfo='text',
)
# Hide all axis decoration on both axes
axis = dict(
    showline=False,
    zeroline=False,
    showgrid=False,
    showticklabels=False,
    title='',
)
width = 800
height = 800
layout = go.Layout(
    title="Friendship network among heavy gamers (top 1% in playtime in hours) <br> Nodesize given by degree",
    font=dict(size=25),
    showlegend=False,
    autosize=False,
    width=width,
    height=height,
    xaxis=go.layout.XAxis(axis),
    yaxis=go.layout.YAxis(axis),
    margin=go.layout.Margin(l=40, r=40, b=85, t=100),
    hovermode='closest',
)
fig1 = go.Figure(data=[trace1, trace2], layout=layout)
# The 800x800 canvas above is immediately overridden: final size is 1500x1500
fig1.update_layout(
    autosize=False,
    width=1500,
    height=1500)
fig1.show()
# Average clustering coefficient over all nodes.
# nx.average_clustering(G) is exactly sum(nx.clustering(G).values()) / n.
avg_clust = nx.average_clustering(graph)
avg_clust
%%time
# Betweenness centrality
bet_cen = nx.betweenness_centrality(graph)
# Closeness centrality
clo_cen = nx.closeness_centrality(graph)
# Eigenvector centrality
eig_cen = nx.eigenvector_centrality_numpy(graph)
# Get centralities and playtimes in matching lists
bet_cen_list = []
clo_cen_list = []
eig_cen_list = []
playtimes_list = []
for k in graph.nodes:
playtimes_list.append(friends_joined_pd[friends_joined_pd.steamid == k].playtime_average_player.iloc[0])
bet_cen_list.append(bet_cen[k])
clo_cen_list.append(clo_cen[k])
eig_cen_list.append(eig_cen[k])
# compute correlations
from scipy.stats.stats import pearsonr
print('Correlation betweeness centrality and playtime: ' + str(round(pearsonr(bet_cen_list,playtimes_list)[0],3)) + ', p-value: ' + str(round(pearsonr(bet_cen_list,playtimes_list)[1],3)))
print('Correlation closesness centrality and playtime: ' + str(round(pearsonr(clo_cen_list,playtimes_list)[0],3)) + ', p-value: ' + str(round(pearsonr(clo_cen_list,playtimes_list)[1],3)))
print('Correlation eigenvector centrality and playtime: ' + str(round(pearsonr(eig_cen_list,playtimes_list)[0],3)) + ', p-value: ' + str(round(pearsonr(eig_cen_list,playtimes_list)[1],3)))
# extract the largest connected component (a set of node ids);
# max(..., key=len) picks the component with the most nodes
largest_connected_component = max(nx.connected_components(graph), key=len)
# make it a separate graph — note subgraph() returns a *view* backed by `graph`
largest_connected_component_graph = graph.subgraph(largest_connected_component)
# draw the subgraph
%%time
pos = nx.spring_layout(largest_connected_component_graph, iterations = 100, k = 0.03)
plt.figure(figsize=(7,7))
"""
Using the spring layout :
- k controls the distance between the nodes and varies between 0 and 1
- iterations is the number of times simulated annealing is run
default k=0.1 and iterations=50
"""
options = {
'node_size' : 30,
'width': 0.5,
'alpha' : 1,
'with_labels': False,
}
nx.draw(largest_connected_component_graph,
pos,
**options)
plt.title("Largest heavy gamer community")
Recalculate correlations between centrality measures and playtime for the subgraph:
%%time
# Betweenness centrality
bet_cen = nx.betweenness_centrality(largest_connected_component_graph)
# Closeness centrality
clo_cen = nx.closeness_centrality(largest_connected_component_graph)
# Eigenvector centrality
eig_cen = nx.eigenvector_centrality_numpy(largest_connected_component_graph)
# Get centralities and playtimes in matching lists
bet_cen_list = []
clo_cen_list = []
eig_cen_list = []
playtimes_list = []
for k in largest_connected_component_graph.nodes:
playtimes_list.append(friends_joined_pd[friends_joined_pd.steamid == k].playtime_average_player.iloc[0])
bet_cen_list.append(bet_cen[k])
clo_cen_list.append(clo_cen[k])
eig_cen_list.append(eig_cen[k])
# compute correlations
from scipy.stats.stats import pearsonr
print('Correlation betweeness centrality and playtime: ' + str(round(pearsonr(bet_cen_list,playtimes_list)[0],3)) + ', p-value: ' + str(round(pearsonr(bet_cen_list,playtimes_list)[1],3)))
print('Correlation closesness centrality and playtime: ' + str(round(pearsonr(clo_cen_list,playtimes_list)[0],3)) + ', p-value: ' + str(round(pearsonr(clo_cen_list,playtimes_list)[1],3)))
print('Correlation eigenvector centrality and playtime: ' + str(round(pearsonr(eig_cen_list,playtimes_list)[0],3)) + ', p-value: ' + str(round(pearsonr(eig_cen_list,playtimes_list)[1],3)))